---
title: "Appendix Tables and Figures"
output:
  github_document:
    toc: true
editor_options: 
  chunk_output_type: console
always_allow_html: true
---

```{r setup, include=FALSE}
# code chunk settings
knitr::opts_chunk$set(echo = TRUE,
                      warning = FALSE,
                      message = FALSE,
                      fig.width = 12,
                      fig.align = 'center')

usethis::use_blank_slate()

### ----------- packages + settings
pacman::p_load(tidyverse,
               lubridate,
               #stats
               survey,
               sandwich,
               lmtest,
               fastDummies,
               # figs and tables
               ggimage,
               patchwork,
               AMR,   
               waffle,
               cowplot,
               zoo,
               pscl,
               kableExtra,
               texreg)
```


```{r data}
### ----------- data
# survey + extension data for all participants
merged_data <- read_rds("data/yg_browser_cces_merged.rds")
# activity data
activity_data <- read_rds("data/activity_yg_cces.rds") 
# browser history data
browser_history_data <-read_rds("data/browser_history_yg_cces.rds")

# over time activity data, averages by days and channel type
day_count_averages <- read_csv( "data/day_count_averages.csv")
day_time_averages <- read_csv( "data/day_time_averages.csv")

# referrers data (see build.R for construction of this table)
on_platform_referrers_by_channel <- 
  read_csv('data/on_platform_referrers_by_channel_wtd.csv')

# recommendations data
recs_data <- 
  read_delim("data/recommendation_pipeline_wtd.tsv", delim = '\t')

# color palette 
color_palette <- c("#FFA500", "#CD5C5C", "#015CB9", "#E3E6E6")
```

```{r helper functions, include = TRUE}
source("helper_fxns.R") # these are mostly for plotting and presentation
```

## Table S1: Demographic statistics by sample

```{r calculate demog distribution}
# select variables of interest
demo_vars <- c(
  'gender',
  'race',
  'presvote16post',
  'employ',
  'educ2',
  'religion',
  'marital',
  'faminc_new',
  'pid3',
  'age_group'
)
demographics_data <- merged_data %>%
  dplyr::select(weight_cmd, all_of(demo_vars), browser_sample) %>%
  fastDummies::dummy_cols(select_columns = all_of(demo_vars))
# level of each variable
var_levels <- demographics_data %>% 
  dplyr::select(-demo_vars, -weight_cmd, -browser_sample) %>% 
  names()

# function to calculate (un)weighted stats depending on sample
weighted_stats_fxn <- function (var, sample) {
  require(TAM)
  require(diagis)
  if (sample == "full") {
    d <- # take non-null in variable
      demographics_data[complete.cases(demographics_data[[var]], demographics_data$weight_cmd), , drop =
                          FALSE][[var]]
    w <-
      demographics_data$weight_cmd[complete.cases(demographics_data[[var]],  demographics_data$weight_cmd)]
  } else if (sample == "browser") {
    d <- # non-Nulls in extension sample
      demographics_data[complete.cases(demographics_data[[var]], demographics_data$weight_cmd) &
                          demographics_data$browser_sample == "browser", , drop = FALSE][[var]]
    w <-
      demographics_data$weight_cmd[complete.cases(demographics_data[[var]],  demographics_data$weight_cmd) &
                                     demographics_data$browser_sample == "browser"]
  }
  out <- data.frame(
    "variable" = str_split(string = var, pattern = "_(?=[^_]+$)")[[1]][1] ,
    "level" = str_split(string = var, pattern = "_(?=[^_]+$)")[[1]][2] ,
    "n" = length(d),
    "unweighted_mean" = mean(d, na.rm = T),
    "weighted_mean" = TAM::weighted_mean(d, w = w),
    "unweighted_se" = sd(d, na.rm = T) / sqrt(length(d)),
    "weighted_se" = diagis::weighted_se(d, w = w, na.rm = TRUE),
    "sample" = sample
  )
  return(out)
}

# function to format table
custom_kable_weighted <- function(data, 
                                  col_names = c("", "Weighted", "Unweighted")) {
  kbl(
    data %>% dplyr::select(-variable),
    booktabs = TRUE,
    caption = "Full and extension sample demographics",
    format = "html",
    align = "c",
    digits = 2,
    linesep = "",
    col.names = col_names
  )  %>%
    kable_styling(latex_options =  "HOLD_position",
                  font_size = 10,
                  #bootstrap_options = c("striped","condensed"),
                  stripe_index = c(1, 5, 13, 17, 21, 29, 35, 41, 51, 57)) %>%
    group_rows("Gender", 1, 4,
               background = "gray!6",
               latex_gap_space = "0em") %>%
    group_rows("Race", 5, 8,
               background = "gray!6",
               latex_gap_space = "0em") %>%
    group_rows("2016 presidential vote", 9, 12,
               background = "gray!6",
               latex_gap_space = "0em") %>%
    group_rows("Employment status", 13, 16,
               background = "gray!6",
               latex_gap_space = "0em") %>%
    group_rows("Education", 16, 24,
               background = "gray!6",
               latex_gap_space = "0em") %>%
    group_rows("Religion", 25, 30,
               background = "gray!6",
               latex_gap_space = "0em") %>%
    group_rows("Marital status", 31, 36,
               background = "gray!6",
               latex_gap_space = "0em") %>%
    group_rows("Family income", 37, 46,
               background = "gray!6",
               latex_gap_space = "0em") %>%
    group_rows("Party identification", 47, 52,
               background = "gray!6",
               latex_gap_space = "0em") %>%
    group_rows("Age", 53, 60,
               background = "gray!6",
               latex_gap_space = "0em") %>% 
    footnote(
      c("Weighted estimates use YouGov survey weights.",
        "Standard errors are in parentheses."))%>%
    row_spec(61, extra_latex_after = "\\cline{3-4}")%>%
    row_spec(62, hline_after = TRUE)
}
```

```{r}
# get (un)weighted estimates for extension sample
browser_sample <-
  map(var_levels, ~weighted_stats_fxn(.x, sample = "browser"))
# get (un)weighted estimates for full survey sample
full_sample <-
  map(var_levels, ~weighted_stats_fxn(.x, sample = "full"))

# merge full and extension estimates together
demographic_stats_table <- browser_sample %>%
  bind_rows() %>%
  left_join(
    bind_rows(full_sample),
    by = c("variable", "level"),
    suffix = c(".browser", ".full")
  ) %>%
  mutate_at(
    vars(
      starts_with("unweighted_"),
      starts_with("weighted_")
    ),
    list( ~ format(round(., 2), nsmall = 2) )
  )

# number of respondents in survey sample
n_full <- 
  demographics_data %>% 
  nrow()
# number of respondents in extension sample
n_browser <- 
  demographics_data %>% 
  filter(browser_sample == "browser") %>% 
  nrow()
```


```{r demog table}
# reformat 
full_summ <- demographic_stats_table %>%
  dplyr::select(
    variable,
    level,
    weighted_mean.full,
    weighted_se.full,
    unweighted_mean.full,
    unweighted_se.full
  ) %>%
  mutate(
    unweighted_se.full = paste0("(", as.character(unweighted_se.full), ")"),
    weighted_se.full = paste0("(", as.character(weighted_se.full), ")"),
    weighted_mean.full = as.character(weighted_mean.full),
    unweighted_mean.full  = as.character(unweighted_mean.full)
  ) %>%
  pivot_longer(c(
    weighted_mean.full,
    weighted_se.full,
    unweighted_mean.full,
    unweighted_se.full
  )) %>%
  mutate(weighted = ifelse(str_detect(name, "unweighted"), "unweighted", "weighted")) %>%
  pivot_wider(-c(value), weighted) %>%
  mutate(
    weighted = ifelse(is.na(weighted), lag(weighted, n = 2L), weighted),
    unweighted = ifelse(is.na(unweighted), lead(unweighted, n = 2L), unweighted)
  ) %>%
  filter(str_detect(name, "unweighted")) %>% 
  filter(level != "NA" & level != "Other") %>% 
  select(-name) %>%
  mutate(level = ifelse(1:length(.$level) %% 2 == 0, "", level)) %>%
  add_row(
    variable = "",
    level = "",
    weighted = paste0("n = ", n_full),
    unweighted = paste0("n = ", n_full)
  ) 

# extension sample
browser_summ <- demographic_stats_table %>%
  dplyr::select(
    variable,
    level,
    weighted_mean.browser,
    weighted_se.browser,
    unweighted_mean.browser,
    unweighted_se.browser
  ) %>%
  mutate(
    unweighted_se.browser = paste0("(", as.character(unweighted_se.browser), ")"),
    weighted_se.browser = paste0("(", as.character(weighted_se.browser), ")"),
    weighted_mean.browser = as.character(weighted_mean.browser),
    unweighted_mean.browser  = as.character(unweighted_mean.browser)
  ) %>%
  pivot_longer(
    c(
      weighted_mean.browser,
      weighted_se.browser,
      unweighted_mean.browser,
      unweighted_se.browser
    )
  ) %>%
  mutate(weighted = ifelse(str_detect(name, "unweighted"), "unweighted", "weighted")) %>%
  pivot_wider(-c(value), weighted) %>%
  mutate(
    weighted = ifelse(is.na(weighted), lag(weighted, n = 2L), weighted),
    unweighted = ifelse(is.na(unweighted), lead(unweighted, n = 2L), unweighted)
  ) %>%
  filter(str_detect(name, "unweighted")) %>%
  filter(level != "NA" & level != "Other") %>% 
  select(-name) %>%
  mutate(level = ifelse(1:length(.$level) %% 2 == 0, "", level)) %>%
  add_row(
    variable = "",
    level = "",
    weighted = paste0("n = ", n_browser),
    unweighted = paste0("n = ", n_browser)
  )

# put full and extension sample tables together...
combined_table <- full_summ %>%
  rename(full_weighted = weighted,
         full_unweighted = unweighted) %>%
  bind_cols(
    browser_summ %>%
      rename(
        extension_weighted = weighted,
        extension_unweighted = unweighted
      ) %>% 
      select(-variable, -level)
  )

# print table
custom_kable_weighted(combined_table,
                      col_names = c("", "Full weighted", "Full unweighted", 
                                    "Extension weighted", "Extension unweighted"))

```


## Figure S1: Total participants with browser activity data over time

```{r attrition}
# study period
date_range <- as.Date(c("2020-07-21", "2020-12-31"))
# all dates in this period
date_vector <-
  as.Date(seq(date_range[1], date_range[2], by = "day"))

# count number of unique individuals per day
users_by_mat <-
  matrix(
    data = 0,
    nrow = nrow(activity_data),
    ncol = length(date_vector)
  )
for (i in 1:nrow(activity_data)) {
  start_id <-
    which(date_vector == as.Date(activity_data[i, ]$activity_start_date))
  end_id <-
    which(date_vector == as.Date(activity_data[i, ]$activity_end_date))
  # interpolate between start and end date
  users_by_mat[i, start_id:end_id] <- activity_data[i, ]$weight_cmd
}

# tally total number for each day
n_users_by_date <- tibble(date = date_vector,
                          n_activity_users = colSums(users_by_mat))

# plot
n_users_by_date %>%
  ggplot(aes(x = date, y = n_activity_users)) +
  geom_col(alpha = .8,
           position = position_dodge2(padding = .1)) +
  labs(x = "", y = "Number of participants") +
  theme_minimal()


ggsave('./appendix-figures_files/attrition_wtd.png',
      dpi = 600, width = 10, height = 6)
ggsave('./appendix-figures_files/attrition_wtd.pdf',
      dpi = 600, width = 10, height = 6)
```


Distribution of time in study 

```{r}
# average number of days in study
ggplot(tibble(days_enrolled = rowSums(users_by_mat))) +
  geom_histogram(aes(x = days_enrolled),
                 alpha = .8,
                 bins = length(date_vector)/2) +
  geom_vline(xintercept = median(rowSums(users_by_mat)),
             color = "blue") +
  labs(x = "Days enrolled in study",
       y = "Number of participants") +
  annotate(geom = "text", 
           label = paste0("median = ", median(rowSums(users_by_mat)), " days"),
           angle = 90,
           color = "blue",
           x =  median(rowSums(users_by_mat)) - 2,
           y = 150) +
  theme_minimal()
```


## Figure S2: Coonsumption levels over time by channel type


```{r}
# calculate centered moving averages for each channel type
# views
ma_count <- day_count_averages %>%
  group_by(source) %>%
  mutate(
    ma7_time = rollmean(avg_count_day, k = 7, fill = NA),
    ma14_time = rollmean(avg_count_day, k = 14, fill = NA),
    ma21_time = rollmean(avg_count_day, k = 21, fill = NA),
    ln_time = log(avg_count_day),
    ln_ma14_time = rollapplyr(
      ln_time,
      14,
      mean,
      na.rm = TRUE,
      by = 1,
      partial = TRUE,
      fill = NA
    )
  )
# time elapsed
ma_time <- day_time_averages %>%
  group_by(source) %>%
  mutate(
    ma7_time = rollmean(avg_time_day, k = 7, fill = NA),
    ma14_time = rollmean(avg_time_day, k = 14, fill = NA),
    ma21_time = rollmean(avg_time_day, k = 21, fill = NA),
    ln_time = log(avg_time_day),
    ln_ma14_time = rollapplyr(
      ln_time,
      14,
      mean,
      na.rm = TRUE,
      by = 1,
      partial = TRUE,
      fill = NA
    )
  )

# plots
sma_count_plot <- ma_count %>%
  mutate(source = recode_channel_type_fxn(source)) %>%
  ggplot(aes(x = full_date, y = avg_count_day,
             color = source)) +
  geom_point(alpha = .6) +
  geom_line(
    data =  . %>% filter((month(full_date) >= 8 &
                            lubridate::day(full_date) >= 15) |  month(full_date) >= 9),
    aes(y = ma21_time,
        color = source),
    size = 1
  ) +
  scale_color_manual(
    values = c(
      "Alternative \nchannel" = "#FFA500",
      "Extremist \nchannel" = "#CD5C5C",
      "Mainstream media \nchannel" = "#015CB9",
      "Other \nchannel" = "#E3E6E6"
    ),
    name = ""
  ) +
  labs(x = "",
       y = "Average number of visits to\nchannel videos")  +
  theme_minimal() +
  theme(legend.position = "bottom") +
  guides(linetype = guide_legend(""))

sma_time_plot <- ma_time %>%
  mutate(source = recode_channel_type_fxn(source)) %>%
  ggplot(aes(x = full_date, y = avg_time_day,
             color = source)) +
  geom_point(alpha = .6) +
  geom_line(
    data = . %>% filter((
      month(full_date) >= 8 &
        lubridate::day(full_date) >= 15
    ) |  month(full_date) >= 9),
    aes(y = ma21_time,
        color = source),
    size = 1
  ) +
  scale_color_manual(
    values = c(
      "Alternative \nchannel" = "#FFA500",
      "Extremist \nchannel" = "#CD5C5C",
      "Mainstream media \nchannel" = "#015CB9",
      "Other \nchannel" = "#E3E6E6"
    ),
    name = ""
  ) +
  labs(x = "",
       y = "Average minutes spent on\nchannel videos")  +
  theme_minimal() +
  theme(legend.position = "bottom") +
  guides(linetype = guide_legend(""))


sma_count_plot + sma_time_plot + plot_layout(ncol=1,heights=c(2,1))

ggsave('./appendix-figures_files/trend_by_channel_type_sma.png',
      dpi = 600, width = 10, height = 14)
ggsave('./appendix-figures_files/trend_by_channel_type_sma.pdf',
      dpi = 600, width = 10, height = 14)
```


## Figure S3 and S4: YouTube video diets of individuals who viewed any alternative/extremist channel video

Distribution of time spent per week on channels for anyone who watched an alternative (extremist) channel video over the course of the study. Bars are in descending order of most to least time on alternative (extremist) channel videos.

```{r }
#cumulative time spent on that channel and the cumulative user 
cumsum_fxn <- function (var, data) {
  channel_type <-
      str_replace(str_extract(var, "[a-z]+_all_week$"), "_all_week", "")
  data %>%
    arrange(desc(.data[[var]])) %>%
    mutate(
      users = weight_cmd / sum(.$weight_cmd, na.rm = T),
      views = (.data[[var]] / sum(.data[[var]], na.rm = T)),
      cum_user = cumsum(users),
      cum_views = cumsum(views),
      ln_cum_user = log10(cumsum(users)),
      ln_cum_views = log10(cumsum(views)),
      source = channel_type
    ) %>%
    select(cum_user, cum_views, ln_cum_user, source, caseid, weight_cmd)
}

# dependent variables (time spent per week) by channel type
minutes_activity_time_all_week <- c(
    "minutes_activity_yt_video_time_elapsed_capped_total_alternative_all_week",
    "minutes_activity_yt_video_time_elapsed_capped_total_extremist_all_week",
    "minutes_activity_yt_video_time_elapsed_capped_total_mainstream_all_week",
    "minutes_activity_yt_video_time_elapsed_capped_total_other_all_week"
  )

# calculate for all channel types
concentration_time_user <- map_dfr(minutes_activity_time_all_week, 
                            ~cumsum_fxn(.x, activity_data))

# join with rest of survey data
activity_data_supers <- activity_data %>%
  left_join(
    concentration_time_user %>%
      select(caseid, cum_views,  source) %>%
      pivot_wider(
        names_from = "source",
        names_glue = "cumsum_{source}",
        values_from = "cum_views"
      ),
    by = "caseid"
  ) %>%
  # define a superconsumer as someone who is in the top 80th percentile of watch time per week
  mutate(
    super_alternative = if_else(cumsum_alternative <= .8, 1, 0),
    super_extremist = if_else(cumsum_extremist <= .8, 1, 0),
    super_mainstream = if_else(cumsum_mainstream <= .8, 1, 0)
  )


# anyone who watched at least one alternative
top_time_all_weeks_most_any_alt <- activity_data_supers  %>% 
  filter(at_alt == 1)  %>% 
  arrange(desc(minutes_activity_yt_video_time_elapsed_capped_total_alternative_all_week)) %>% 
  mutate(rank = 1:nrow(.)) %>% 
  pivot_longer(cols = all_of(minutes_activity_time_all_week)) %>% 
  mutate(channel_type = str_extract(str_replace(name, "_all_week$", ""), "[a-z]+$"),
         minutes_value = value,
         image = "https://i.postimg.cc/bwL3hjPY/user-icon-extremist.png" ) 

# anyone who watched at least one extremist
top_time_all_weeks_most_any_ext <- activity_data_supers  %>% 
  filter(at_ext == 1)  %>% 
  arrange(desc(minutes_activity_yt_video_time_elapsed_capped_total_extremist_all_week)) %>% 
  mutate(rank = 1:nrow(.)) %>% 
  pivot_longer(cols = all_of(minutes_activity_time_all_week)) %>% 
  mutate(channel_type = str_extract(str_replace(name, "_all_week$", ""), "[a-z]+$"),
         minutes_value = value,
         image = "https://i.postimg.cc/D0BPKrVj/user-icon-alternative.png" )

topuser_plot(
    data = top_time_all_weeks_most_any_alt,
    figure_size = .02,
    figure_space = -100,
    channel_type = "Alternative",
    title = "Particpants who watched any alternative channel video",
    ylabel = "Minutes per week on YouTube videos",
    y_limit = 3e3
  )
ggsave('./appendix-figures_files/any_alternative_video_diet_time_elapsed_week.png',
      dpi = 600, width = 10, height = 6)
ggsave('./appendix-figures_files/any_alternative_video_diet_time_elapsed_week.pdf',
      dpi = 600, width = 10, height = 6)

topuser_plot(
    data = top_time_all_weeks_most_any_ext,
    figure_size = .02,
    figure_space = -160,
    channel_type = "Extremist",
    title = "Particpants who watched any extremist channel video",
    ylabel = "Minutes per week on YouTube videos",
    y_limit = 3e3
  )
ggsave('./appendix-figures_files/any_extremist_video_diet_time_elapsed_week.png',
      dpi = 600, width = 10, height = 6)
ggsave('./appendix-figures_files/any_extremist_video_diet_time_elapsed_week.pdf',
      dpi = 600, width = 10, height = 6)
```


## Figure S5. YouTube video diets of alternative and extremist superconsumers

Distribution of time spent per week on channels for alternative (extremist) superconsumers. Superconsumers are individuals who make up the top 80th percentile of watch time for the given channel type. Bars are in descending order of most to least time on alternative (extremist) channel videos.


```{r calculate cumulative sums}

# join with rest of survey data
activity_data_supers <- activity_data %>%
  left_join(
    concentration_time_user %>%
      select(caseid, cum_views,  source) %>%
      pivot_wider(
        names_from = "source",
        names_glue = "cumsum_{source}",
        values_from = "cum_views"
      ),
    by = "caseid"
  ) %>%
  # define a superconsumer as someone who is in the top 80th percentile of watch time per week
  mutate(
    super_alternative = if_else(cumsum_alternative <= .8, 1, 0),
    super_extremist = if_else(cumsum_extremist <= .8, 1, 0),
    super_mainstream = if_else(cumsum_mainstream <= .8, 1, 0)
  )


# select the top alternative super consumers
top_time_all_weeks_most_alt <- activity_data_supers  %>%
  filter(super_alternative == 1)  %>%
  arrange(desc(
    minutes_activity_yt_video_time_elapsed_capped_total_alternative_all_week
  )) %>%
  mutate(rank = 1:nrow(.)) %>%
  pivot_longer(cols = all_of(minutes_activity_time_all_week)) %>%
  mutate(
    channel_type = str_replace(name, "_all_week$", ""),
    channel_type = str_extract(channel_type, "[a-z]+$"),
    minutes_value = value,
    image = if_else(
      super_extremist == 1 ,
      "https://i.postimg.cc/bwL3hjPY/user-icon-extremist.png",
      NA_character_
    )
  )

# select the top extremist super consumers
top_time_all_weeks_most_ext <- activity_data_supers  %>%
  filter(super_extremist == 1)  %>%
  arrange(desc(
    minutes_activity_yt_video_time_elapsed_capped_total_extremist_all_week
  )) %>%
  mutate(rank = 1:nrow(.)) %>%
  pivot_longer(cols = all_of(minutes_activity_time_all_week)) %>%
  mutate(
    channel_type = str_replace(name, "_all_week$", ""),
    channel_type = str_extract(channel_type, "[a-z]+$"),
    minutes_value = value,
    image = if_else(
      super_alternative == 1 ,
      "https://i.postimg.cc/D0BPKrVj/user-icon-alternative.png",
      NA_character_
    )
  )

alternative_superconsumers <- activity_data_supers %>% 
  filter(super_alternative == 1) 
extemist_superconsumers <- activity_data_supers %>% 
  filter(super_extremist == 1) 

# alternative superconsumers
alternative_superconsumers_weeks_plot <-
  topuser_plot(
    data = top_time_all_weeks_most_alt,
    figure_space = -160,
    figure_size = .075,
    channel_type = "Alternative",
    title = "Alternative channel superconsumers",
    ylabel = "Minutes per week on YouTube videos",
    y_limit = 3e3
  )
# extremist superconsumers
extremist_superconsumers_weeks_plot <-
  topuser_plot(
    data = top_time_all_weeks_most_ext,
    figure_space = -160,
    figure_size = .075,
    channel_type = "Extremist",
    title = "Extremist channel superconsumers",
    ylabel = "Minutes per week on YouTube videos",
    y_limit = 3e3
  )

# same legend 
common_legend_superconsumers <-
  get_legend(alternative_superconsumers_weeks_plot)

# put panels side by side
plot_grid(
  plot_grid(
    alternative_superconsumers_weeks_plot +
      theme(legend.position = 'none'),
    extremist_superconsumers_weeks_plot +
      theme(legend.position = 'none'),
    labels = c("A", "B"),
    label_size = 16,
    nrow = 1
  ),
  common_legend_superconsumers,
  ncol = 1,
  rel_heights = c(1, .1)
)

ggsave('./appendix-figures_files/superconsumers_time_elapsed_week.png',
      dpi = 600, width = 12, height = 6)
ggsave('./appendix-figures_files/superconsumers_time_elapsed_week.pdf',
      dpi = 600, width = 12, height = 6)
```


## Figure S6. Recommendation follows by video channel type

```{r recommendations followed, results='hide'}
### ===== recommendations followed setup
alternative_followed_table <-
  recs_prop_table_fxn(channel_type = "alternative",
                      statistic = "followed")
extremist_followed_table <-
  recs_prop_table_fxn(channel_type = "extremist",
                      statistic = "followed")
mainstream_followed_table <-
  recs_prop_table_fxn(channel_type = "mainstream",
                      statistic = "followed")
mainstream_followed_table_rounded <- mainstream_followed_table %>%
  mutate(
    prop = case_when(
      variable == "rec_match_alternative_match" ~ .01,
      variable == "rec_match_other_match" ~ 0.295,
      variable == "rec_match_mainstream_match" ~ 0.70,
      TRUE ~ prop
    )
  )
other_followed_table <- 
  recs_prop_table_fxn(channel_type = "other",
                      statistic = "followed")
#outputs to waffle_plot_fxn
alternative_recs_followed <-
  waffle_plot_fxn(alternative_followed_table, statistic = "followed",
                  title = "Alternative channel\nvideos")
extremist_recs_followed <-
  waffle_plot_fxn(extremist_followed_table, statistic = "followed",
                  title = "Extremist channel\nvideos")
mainstream_recs_followed <-
  waffle_plot_fxn(mainstream_followed_table_rounded, statistic = "followed",
                  make_proportional = F,
                  title = "Mainstream media\nchannel videoes")
other_recs_followed <-
  waffle_plot_fxn(other_followed_table, statistic = "followed",
                  title = "Other channel\nvideos")


# bar on top of waffles to show scale
total_recs_followed_table <- recs_data %>%
  filter(variable == "total recs followed") %>%
  pivot_longer(-variable) %>%
  select(-variable) %>%
  mutate(
    overall_percent = 100 * (value / sum(value)),
    cumulative_percent = cumsum(overall_percent)
  )

bar_recs_followed_plot <- scale_plot_fxn(
  data = total_recs_followed_table,
  multiplicative_factor = 8,
  text_positions = c(-3, 4, 12, 100),
  segment_y_lower = 4,
  segment_y_upper = 10,
  statistic = "followed"
)

# create dummy plot for legend
dummy_plot_followed <- alternative_followed_table %>%
  ggplot(aes(fill = factor(followed),
             values = prop * 100))  +
  geom_waffle(
    n_rows = 10,
    size = .5,
    color = "white",
    make_proportional = TRUE,
    radius = unit(5, "pt")
  ) +
  scale_fill_manual(
    values = c(
      'alternative' = "#FFA500",
      'extremist' = "#CD5C5C",
      'mainstream' = "#015CB9",
      'other' = '#E3E6E6'
    ),
    labels = c(
      'Alternative channel\nvideos' ,
      'Extremist channel\nvideos',
      'Mainstream media\nchannel videos',
      'Other channel\nvideos'
    ),
    name = "Recommendations followed to:"
  ) +
  theme(
    plot.title = ggtext::element_markdown(size = 12, hjust = .5),
    legend.position = 'bottom'
  )

common_legend_followed <- lemon::g_legend(dummy_plot_followed)

all_waffles_followed <- gridExtra::arrangeGrob(
  alternative_recs_followed,
  extremist_recs_followed,
  mainstream_recs_followed,
  other_recs_followed,
  nrow = 1
)
```


```{r recommendations followed plot, fig.height = 8}
recs_followed_grid <- plot_grid(
  plot_grid(
    bar_recs_followed_plot,
    all_waffles_followed,
    rel_heights = c(1.2, 1.5),
    labels = c(
      "A) Percentage of total recommendations followed:",
      "B) Recommendations followed when watching:"
    ),
    label_y = c(.9, 1.1),
    label_size = 18,
    hjust = 0,
    nrow = 2
  ),
  common_legend_followed,
  ncol = 1,
  rel_heights = c(1, .1)
)
recs_followed_grid

ggsave('./appendix-figures_files/waffle_recs_followed_wtd.png',
      dpi = 600, width = 14, height = 8)

ggsave('./appendix-figures_files/waffle_recs_followed_wtd.pdf',
      dpi = 600, width = 14, height = 8)
```


## Figure S7: Pages viewed immediately prior to YouTube videos by channel type

(CODE ONLY, SOURCE DATA IS ON SECURE SERVER. See `build.R` under "REFERRERS DATA" for construction of `on_platform_referrers_by_channel.csv`)

```{r }
on_platform_referrers_by_channel_plot <-
  on_platform_referrers_by_channel %>%
  mutate(
    channel_type = case_when(
      channel_type == "alternative" ~ "Alternative channel videos",
      channel_type == "extremist" ~ "Extremist channel videos",
      channel_type == "mainstream" ~ "Mainstream media channel videos",
      channel_type == "other" ~ "Other channel videos"
    )
  ) %>%
  ggplot(
    data = .,
    aes(
      x = str_wrap_factor(reorder(channel_type, order_var), width = 17),
      y = percentage,
      fill = youtube_video_referrers_by_channel_type,
      group = channel_type
    )
  ) +
  geom_col(position = position_dodge2(padding = 0,
                                      width = .95)) +
  geom_linerange(aes(ymin = ci_lwr,
                     ymax = ci_upr),
                 size = .7,
                 position = position_dodge2(width = .90)) +
  labs(x = "", y = "Estimated percentage from preceding link") +
  scale_fill_manual(
    values = c(
      'alternative' = "#FFA500",
      'extremist'  = "#CD5C5C",
      'mainstream'  =  "#015CB9",
      'other' = "#E3E6E6",
      'Non-video on-platform'  =  "#7f7f7f",
      'Off-platform'  =  "#2D2E32"
    ),
    breaks  = c(
      'alternative',
      'other',
      'extremist',
      'Non-video on-platform',
      'mainstream',
      'Off-platform'
    ),
    labels = c(
      'alternative' = "Alternative\nchannel videos",
      'extremist' = "Extremist\nchannel videos",
      'mainstream'  =  "Mainstream media\nchannel videos",
      'other' = "Other\nchannel videos",
      'Non-video on-platform'  = "Non-video \non-platform",
      'Off-platform'  = "Off-platform"
    ),
    name = "Type of preceding link:"
  ) +
  scale_y_continuous(
    limits = c(0, 60),
    labels = paste0(seq(0, 60, 20), "%"),
    breaks = seq(0, 60, 20)
  ) +
  ggthemes::theme_gdocs() +
  theme(
    legend.position = 'bottom',
    legend.direction = 'horizontal',
    panel.grid.major.y = element_line(color = "grey95"),
    panel.grid.major.x = element_blank(),
    axis.text.x = element_text(size = 12),
    plot.background = element_rect(color = "white")
  )

on_platform_referrers_by_channel_plot
ggsave('./appendix-figures_files/on_platform_referrers_by_channel_wtd.png',
      dpi = 600, width = 16, height = 10)
ggsave('./appendix-figures_files/on_platform_referrers_by_channel_wtd.pdf',
      dpi = 600, width = 16, height = 10)
```


[TABLE S2: Correlates of time on YouTube videos by channel type]

## Figure S10 and Table S5: Correlates of YouTube video exposure by channel type (with party controls)

```{r formulas and functions}
# with party ID
f_time_pid_alternative_all <-
  formula(
    activity_yt_video_time_elapsed_capped_total_alternative_all ~  rr_cts + jw_cts + fem_cts + age + gender + educ2 + race + pid_lean
  )
f_time_pid_extremist_all <-
  formula(
    activity_yt_video_time_elapsed_capped_total_extremist_all  ~  rr_cts + jw_cts + fem_cts + age + gender + educ2 + race + pid_lean
  )
f_time_pid_mainstream_all <-
  formula(
    activity_yt_video_time_elapsed_capped_total_mainstream_all  ~  rr_cts + jw_cts + fem_cts + age + gender + educ2 + race + pid_lean
  )
f_visit_pid_alternative_all <-
  formula(
    activity_yt_n_video_alternative_all ~  rr_cts + jw_cts + fem_cts + age + gender + educ2 + race + pid_lean
  )
f_visit_pid_extremist_all <-
  formula(
    activity_yt_n_video_extremist_all ~  rr_cts + jw_cts + fem_cts + age + gender + educ2 + race + pid_lean
  )
f_visit_pid_mainstream_all <-
  formula(
    activity_yt_n_video_mainstream_all  ~  rr_cts + jw_cts + fem_cts + age + gender + educ2 + race  + pid_lean
  )

# (FIRE ) with different rr measure (rr_over_mean_cces)
f_time_fire_alternative_all <-
  formula(
    activity_yt_video_time_elapsed_capped_total_alternative_all ~  rr_over_mean_cces + jw_cts + fem_cts + age + gender + educ2 + race
  )
f_time_fire_extremist_all <-
  formula(
    activity_yt_video_time_elapsed_capped_total_extremist_all  ~  rr_over_mean_cces + jw_cts + fem_cts + age + gender + educ2 + race
  )
f_time_fire_mainstream_all <-
  formula(
    activity_yt_video_time_elapsed_capped_total_mainstream_all  ~  rr_over_mean_cces + jw_cts + fem_cts + age + gender + educ2 + race
  )
f_visit_fire_alternative_all <-
  formula(
    activity_yt_n_video_alternative_all ~  rr_over_mean_cces + jw_cts + fem_cts + age + gender + educ2 + race
  )
f_visit_fire_extremist_all <-
  formula(activity_yt_n_video_extremist_all ~  rr_over_mean_cces + jw_cts + fem_cts + age + gender + educ2 + race)

f_visit_fire_mainstream_all <-
  formula(
    activity_yt_n_video_mainstream_all  ~  rr_over_mean_cces + jw_cts + fem_cts + age + gender + educ2 + race
  )


#FIRE
time_fire_fs <- list(
  f_time_fire_alternative_all,
  f_time_fire_extremist_all,
  f_time_fire_mainstream_all
)
visit_fire_fs <- list(
  f_visit_fire_alternative_all,
  f_visit_fire_extremist_all,
  f_visit_fire_mainstream_all
)

#PID 
time_pid_fs <- list(
  f_time_pid_alternative_all,
  f_time_pid_extremist_all,
  f_time_pid_mainstream_all
)
visit_pid_fs <- list(
  f_visit_pid_alternative_all,
  f_visit_pid_extremist_all,
  f_visit_pid_mainstream_all
)

robust_weighted_quasipoisson <- function(data = activity_data, 
                                         formula,
                                         robust_output = TRUE) {
  fit <- glm(formula, 
             family = quasipoisson(link = "log"),
             offset = log(week),
             data = activity_data,
             weights = weight_cmd) 
  
  results <- fit
  
  if (robust_output == TRUE) {
    cov_mod <- vcovHC(fit, type = "HC0")
    std_err <- sqrt(diag(cov_mod))
    q_val <- qnorm(0.975) # stick to 95% CI
    
    results <- bind_cols(
      predictor = fit$coefficients %>% names(),
      estimate = coef(fit),
      robust_se = std_err,
      z = (coef(fit) / std_err),
      p_val = 1.96 * pnorm(abs(coef(fit) / std_err), lower.tail = FALSE),
      ci_lwr = coef(fit) - q_val  * std_err,
      ci_upr = coef(fit) + q_val  * std_err,
    ) %>%
      mutate(
        stat_sig = if_else(p_val < .05, "*", ""),
        channel_type = str_extract(as.character(formula)[2], "alternative|extremist|mainstream")
      ) 
  }
  return(results)
}

```


```{r qpois estimates}
QP_visit_fire_fit <- list()
QP_time_fire_fit <- list()
QP_visit_pid_fit <- list()
QP_time_pid_fit <- list()

for (i in 1:length(visit_fire_fs)) {
  QP_visit_fire_fit[[i]] <-
    robust_weighted_quasipoisson(formula = visit_fire_fs[[i]])
  QP_time_fire_fit[[i]] <-
    robust_weighted_quasipoisson(formula = time_fire_fs[[i]])
  QP_visit_pid_fit[[i]] <-
    robust_weighted_quasipoisson(formula = visit_pid_fs[[i]])
  QP_time_pid_fit[[i]] <-
    robust_weighted_quasipoisson(formula = time_pid_fs[[i]])
}

names(QP_time_fire_fit) <-
  c("time_fire_alternative_full",
    "time_fire_extremist_full",
    "time_fire_mainstream_full")
names(QP_visit_fire_fit) <-
  c("visit_fire_alternative_full",
    "visit_fire_extremist_full",
    "visit_fire_mainstream_full")
names(QP_time_pid_fit) <-
  c("time_pid_alternative_full",
    "time_pid_extremist_full",
    "time_pid_mainstream_full")
names(QP_visit_pid_fit) <-
  c("visit_pid_alternative_full",
    "visit_pid_extremist_full",
    "visit_pid_mainstream_full")


coef_names <- c("Intercept",
                "Denial of racism",
                "Feeling Jews",
                "Hostile sexism",
                "Age",
                "Male",
                "Some college",
                "Bachelor's degree",
                "Post-grad",
                "Non-white")
```


```{r PID visit and time plot, fig.height = 12}
visit_pid_models <- bind_rows(QP_visit_pid_fit)
pid_visit_plot <- coef_plot(data = visit_pid_models) 

time_pid_models <- bind_rows(QP_time_pid_fit)
pid_time_plot <- coef_plot(data = time_pid_models) 

pid_visit_plot + pid_time_plot + plot_layout(ncol = 1)

ggsave('./appendix-figures_files/qpois_coefficients_pid.png',
      dpi = 600, width = 20, height = 11)
```

```{r PID tables} 
# separately estimate without robust SEs becuase tex reg doesn't like this... :L
QP_visit_pid_fit_nonrob <- QP_time_pid_fit_nonrob <- list()
for (i in 1:length(visit_pid_fs)) {
  QP_visit_pid_fit_nonrob[[i]] <-
    robust_weighted_quasipoisson(formula = visit_pid_fs[[i]],
                                 robust_output = F)
  QP_time_pid_fit_nonrob[[i]] <-
    robust_weighted_quasipoisson(formula = time_pid_fs[[i]],
                                 robust_output = F)
}
robust_SEs <- bind_rows(visit_pid_models, time_pid_models)$robust_se
robust_pvals<- bind_rows(visit_pid_models, time_pid_models)$p_val
texreg(c(QP_visit_pid_fit_nonrob, QP_time_pid_fit_nonrob),
       digits = 2,
       booktabs = TRUE, 
       label = "tab:figa7table",
       caption = "Quasipoisson coefficients for correlates of of views and time per week spent on videos from alternative, extremist, and mainstream media channels. Robust standard errors are in parentheses.",
       # texreg reads models as separate lists
       # manually adding in robust p-vals
       override.pvalues = list(robust_pvals[1:12],
                    robust_pvals[13:24],
                    robust_pvals[25:36],
                    robust_pvals[37:48],
                    robust_pvals[49:60],
                    robust_pvals[61:72]),
       override.se = list(robust_SEs[1:12],
                    robust_SEs[13:24],
                    robust_SEs[25:36],
                    robust_SEs[37:48],
                    robust_SEs[49:60],
                    robust_SEs[61:72]),
       custom.model.names = rep(c("Alternative",
                                 "Extremist",
                                 "Mainstream"),2),
       custom.coef.map =
         list(
            "fem_cts" = "Hostile sexism",
            "rr_cts" = "Racial resentment",
            "jw_cts" = "Feeling Jews",
            "pid_leanDemocrat" = "Democrat",
            "pid_leanRepublican" = "Republican",
            "age" = "Age",
            "genderMale" =  "Male",
            "raceNon-white" = "Non-white",
            "educ2Some college" = "Some college",
            "educ24-year" = "Bachelor's degree",
            "educ2Post-grad" = "Post-grad",
            "(Intercept)" = "Intercept"
         ),
          file = "appendix-figures_files/figureA7.tex")

```



### Tables S6 and S7: Separating Hostile sexism and Racial resentment specifications

```{r hostile sexism} 
# without hostile sexism
f_time_pid_alternative_rr <-
  formula(
    activity_yt_video_time_elapsed_capped_total_alternative_all ~  rr_cts + jw_cts + age + gender + educ2 + race 
  )
f_time_pid_extremist_rr <-
  formula(
    activity_yt_video_time_elapsed_capped_total_extremist_all  ~  rr_cts + jw_cts + age + gender + educ2 + race 
  )
f_time_pid_mainstream_rr <-
  formula(
    activity_yt_video_time_elapsed_capped_total_mainstream_all  ~  rr_cts + jw_cts + age + gender + educ2 + race 
  )
f_visit_pid_alternative_rr <-
  formula(
    activity_yt_n_video_alternative_all ~  rr_cts + jw_cts  + age + gender + educ2 + race
  )
f_visit_pid_extremist_rr <-
  formula(activity_yt_n_video_extremist_all ~  rr_cts + jw_cts  + age + gender + educ2 + race)

f_visit_pid_mainstream_rr <-
  formula(
    activity_yt_n_video_mainstream_all  ~  rr_cts + jw_cts + age + gender + educ2 + race
  )


# without racial resentment
f_time_pid_alternative_hs <-
  formula(
    activity_yt_video_time_elapsed_capped_total_alternative_all ~  fem_cts + jw_cts + age + gender + educ2 + race 
  )
f_time_pid_extremist_hs <-
  formula(
    activity_yt_video_time_elapsed_capped_total_extremist_all  ~  fem_cts + jw_cts + age + gender + educ2 + race 
  )
f_time_pid_mainstream_hs <-
  formula(
    activity_yt_video_time_elapsed_capped_total_mainstream_all  ~  fem_cts + jw_cts + age + gender + educ2 + race 
  )

f_visit_pid_alternative_hs <-
  formula(
    activity_yt_n_video_alternative_all ~  fem_cts + jw_cts + age + gender + educ2 + race 
  )
f_visit_pid_extremist_hs <-
  formula(
    activity_yt_n_video_extremist_all  ~  fem_cts + jw_cts + age + gender + educ2 + race 
  )
f_visit_pid_mainstream_hs <-
  formula(
    activity_yt_n_video_mainstream_all  ~  fem_cts + jw_cts + age + gender + educ2 + race 
  )


time_rr_fs <- list(
  f_time_pid_alternative_rr,
  f_time_pid_extremist_rr,
  f_time_pid_mainstream_rr
)
visit_rr_fs <- list(
  f_visit_pid_alternative_rr,
  f_visit_pid_extremist_rr,
  f_visit_pid_mainstream_rr
)
time_hs_fs <- list(
  f_time_pid_alternative_hs,
  f_time_pid_extremist_hs,
  f_time_pid_mainstream_hs
)
visit_hs_fs <- list(
  f_visit_pid_alternative_hs,
  f_visit_pid_extremist_hs,
  f_visit_pid_mainstream_hs
)


QP_time_rr_fit <- list()
QP_time_hs_fit <- list()
QP_visit_rr_fit <- list()
QP_visit_hs_fit <- list()

for (i in 1:length(time_rr_fs)) {
  QP_time_rr_fit[[i]] <-
    robust_weighted_quasipoisson(formula = time_rr_fs[[i]])
  QP_time_hs_fit[[i]] <-
    robust_weighted_quasipoisson(formula = time_hs_fs[[i]])
  QP_visit_rr_fit[[i]] <-
    robust_weighted_quasipoisson(formula = visit_rr_fs[[i]])
  QP_visit_hs_fit[[i]] <-
    robust_weighted_quasipoisson(formula = visit_hs_fs[[i]])
}

names(QP_time_rr_fit) <-
  c("time_rr_alternative_full",
    "time_rr_extremist_full",
    "time_rr_mainstream_full")
names(QP_time_hs_fit) <-
  c("time_hs_alternative_full",
    "time_hs_extremist_full",
    "time_hs_mainstream_full")
names(QP_visit_rr_fit) <-
  c("visit_rr_alternative_full",
    "visit_rr_extremist_full",
    "visit_rr_mainstream_full")
names(QP_visit_hs_fit) <-
  c("visit_hs_alternative_full",
    "visit_hs_extremist_full",
    "visit_hs_mainstream_full")
```

```{r hs rr tables}
# separately estimate without robust SEs becuase tex reg doesn't like this... :L
QP_time_hs_fit_nonrob <- QP_time_rr_fit_nonrob <- list()
QP_visit_hs_fit_nonrob <- QP_visit_rr_fit_nonrob <- list()
for (i in 1:length(time_hs_fs)) {
  QP_time_hs_fit_nonrob[[i]] <-
    robust_weighted_quasipoisson(formula = time_hs_fs[[i]],
                                 robust_output = F)
  QP_time_rr_fit_nonrob[[i]] <-
    robust_weighted_quasipoisson(formula = time_rr_fs[[i]],
                                 robust_output = F)
  QP_visit_hs_fit_nonrob[[i]] <-
    robust_weighted_quasipoisson(formula = visit_hs_fs[[i]],
                                 robust_output = F)
  QP_visit_rr_fit_nonrob[[i]] <-
    robust_weighted_quasipoisson(formula = visit_rr_fs[[i]],
                                 robust_output = F)
}

time_rr_hs_models <- bind_rows(QP_time_hs_fit[[1]],
                               QP_time_rr_fit[[1]],
                               QP_time_hs_fit[[2]],
                               QP_time_rr_fit[[2]],
                               QP_time_hs_fit[[3]],
                               QP_time_rr_fit[[3]])
robust_SEs <- time_rr_hs_models$robust_se
robust_pvals<- time_rr_hs_models$p_val

# output tex table
texreg(
  list(
    QP_time_hs_fit_nonrob[[1]],
    QP_time_rr_fit_nonrob[[1]],
    QP_time_hs_fit_nonrob[[2]],
    QP_time_rr_fit_nonrob[[2]],
    QP_time_hs_fit_nonrob[[3]],
    QP_time_rr_fit_nonrob[[3]]
  ),
       digits = 2,
       booktabs = TRUE, 
       label = "tab:tablea7.1",
       caption = "Quasipoisson coefficients for correlates of of views and time per week spent on videos from alternative, extremist, and mainstream media channels. Robust standard errors are in parentheses.",
       # texreg reads models as separate lists
       # manually adding in robust p-vals
       override.pvalues = list(robust_pvals[1:9],
                    robust_pvals[10:18],
                    robust_pvals[19:27],
                    robust_pvals[28:36],
                    robust_pvals[37:45],
                    robust_pvals[46:54]),
       override.se = list(robust_SEs[1:9],
                    robust_SEs[10:18],
                    robust_SEs[19:27],
                    robust_SEs[28:36],
                    robust_SEs[37:45],
                    robust_SEs[46:54]),
       custom.model.names = rep(c("Alternative",
                                 "Extremist",
                                 "Mainstream"), each = 2),
       custom.coef.map =
         list(
            "fem_cts" = "Hostile sexism",
            "rr_cts" = "Racial resentment",
            "jw_cts" = "Feeling Jews",
            #"pid_leanDemocrat" = "Democrat",
            #"pid_leanRepublican" = "Republican",
            "age" = "Age",
            "genderMale" =  "Male",
            "raceNon-white" = "Non-white",
            "educ2Some college" = "Some college",
            "educ24-year" = "Bachelor's degree",
            "educ2Post-grad" = "Post-grad",
            "(Intercept)" = "Intercept"
         ),
          file = "appendix-figures_files/additional_regressions_hs_rr_time.tex")


visit_rr_hs_models <- bind_rows(QP_visit_hs_fit[[1]],
                               QP_visit_rr_fit[[1]],
                               QP_visit_hs_fit[[2]],
                               QP_visit_rr_fit[[2]],
                               QP_visit_hs_fit[[3]],
                               QP_visit_rr_fit[[3]])
robust_SEs <- visit_rr_hs_models$robust_se
robust_pvals<- visit_rr_hs_models$p_val

# output tex table
texreg(
  list(
    QP_visit_hs_fit_nonrob[[1]],
    QP_visit_rr_fit_nonrob[[1]],
    QP_visit_hs_fit_nonrob[[2]],
    QP_visit_rr_fit_nonrob[[2]],
    QP_visit_hs_fit_nonrob[[3]],
    QP_visit_rr_fit_nonrob[[3]]
  ),
       digits = 2,
       booktabs = TRUE, 
       label = "tab:tablea7.2",
       caption = "Quasipoisson coefficients for correlates of of views and time per week spent on videos from alternative, extremist, and mainstream media channels. Robust standard errors are in parentheses.",
       # texreg reads models as separate lists
       # manually adding in robust SEs
       override.pvalues = list(robust_pvals[1:9],
                    robust_pvals[10:18],
                    robust_pvals[19:27],
                    robust_pvals[28:36],
                    robust_pvals[37:45],
                    robust_pvals[46:54]),
       override.se = list(robust_SEs[1:9],
                    robust_SEs[10:18],
                    robust_SEs[19:27],
                    robust_SEs[28:36],
                    robust_SEs[37:45],
                    robust_SEs[46:54]),
       custom.model.names = rep(c("Alternative",
                                 "Extremist",
                                 "Mainstream"), each = 2),
       custom.coef.map =
         list(
            "fem_cts" = "Hostile sexism",
            "rr_cts" = "Racial resentment",
            "jw_cts" = "Feeling Jews",
            "age" = "Age",
            "genderMale" =  "Male",
            "raceNon-white" = "Non-white",
            "educ2Some college" = "Some college",
            "educ24-year" = "Bachelor's degree",
            "educ2Post-grad" = "Post-grad",
            "(Intercept)" = "Intercept"
         ),
          file = "appendix-figures_files/additional_regressions_hs_rr_views.tex")
```

### Figure S11 and Table S8: Correlates of exposure to YouTube videos by channel type (alternate racial attitude measure) 

```{r FIRE visit and time plot}
visit_fire_models <- bind_rows(QP_visit_fire_fit)
fire_visit_plot <- coef_plot(data = visit_fire_models)

time_fire_models <- bind_rows(QP_time_fire_fit)
fire_time_plot <- coef_plot(data = time_fire_models)
```

```{r eval = F, include=F}
library(cowplot)
plot_grid(
  ggplot(),
  fire_visit_plot,
  ggplot(),
  fire_time_plot,
  nrow = 4,
  rel_heights = c(.1, 1, .1, 1),
  vjust = 3, 
  labels = c("A: View counts", "", "B: Time elapsed", "")
)

ggsave("appendix-figures_files/qpois_coefficient_FIRE.png",
       dpi = 600, height = 20, width = 11 )
```


```{r FIRE tables} 
# separately estimate without robust SEs becuase tex reg doesn't like this... :L
QP_visit_fire_fit_nonrob <- QP_time_fire_fit_nonrob <- list()
for (i in 1:length(time_fire_fs)) {
  QP_visit_fire_fit_nonrob[[i]] <-
    robust_weighted_quasipoisson(formula = visit_fire_fs[[i]],
                                 robust_output = F)
  QP_time_fire_fit_nonrob[[i]] <-
    robust_weighted_quasipoisson(formula = time_fire_fs[[i]],
                                 robust_output = F)
}
robust_SEs <- bind_rows(visit_fire_models, time_fire_models)$robust_se
robust_pvals <- bind_rows(visit_fire_models, time_fire_models)$p_val

texreg(c(QP_visit_fire_fit_nonrob, QP_time_fire_fit_nonrob),
       digits = 2,
       booktabs = TRUE, 
       label = "tab:figa8table",
       caption = "Quasipoisson coefficients for correlates of of views and time per week spent on videos from alternative, extremist, and mainstream media channels. Robust standard errors are in parentheses.",
       # texreg reads models as separate lists
       # manually adding in robust SEs
       override.pvalues =  list(robust_pvals[1:10],
                    robust_pvals[11:20],
                    robust_pvals[21:30],
                    robust_pvals[31:40],
                    robust_pvals[41:50],
                    robust_pvals[51:60]),
       override.se =  list(robust_SEs[1:10],
                    robust_SEs[11:20],
                    robust_SEs[21:30],
                    robust_SEs[31:40],
                    robust_SEs[41:50],
                    robust_SEs[51:60]),
       custom.model.names = rep(c("Alternative",
                                 "Extremist",
                                 "Mainstream"),2),
       custom.coef.map =
         list(
            "fem_cts" = "Hostile sexism",
            "rr_over_mean_cces" = "Denial of racism",
            "jw_cts" = "Feeling Jews",
            "age" = "Age",
            "genderMale" =  "Male",
            "raceNon-white" = "Non-white",
            "educ2Some college" = "Some college",
            "educ24-year" = "Bachelor's degree",
            "educ2Post-grad" = "Post-grad",
            "(Intercept)" = "Intercept"
         ),
          file = "appendix-figures_files/figureA8.tex")

```



### Factor loadings

```{r}
rr_cces_vars <- activity_data %>%
  select(all_of(c("CC18_422e", "CC18_422f", "CC18_422g", "CC18_422h"))) #classical RR
fire_cces_vars <- activity_data %>%
  select(all_of(c("CC18_422e", "CC18_422f", "CC18_422g", "CC18_422h")),
         all_of(c("CC18_422a", "CC18_422b"))) # overstate 
fem_cces_vars <- activity_data %>%
  select(all_of(c("CC18_422c", "CC18_422d"))) #fem
all_cces_vars <- activity_data  %>%
  select(starts_with(c("CC18_422")))

# pca with varimax rotation
rr_pca_rot <- psych::principal(rr_cces_vars,
                 nfactors = 1, 
                 rotate = 'varimax')
rr_pca_rot
fire_pca_rot <- psych::principal(fire_cces_vars,
                 nfactors = 1, 
                 rotate = 'varimax')
fire_pca_rot
fem_pca_rot <- psych::principal(fem_cces_vars,
                 nfactors = 1, 
                 rotate = 'varimax')
fem_pca_rot
all_pca_rot <- psych::principal(all_cces_vars,
                 nfactors = 2, 
                 rotate = 'varimax')
all_pca_rot

# psych::fa.parallel(all_cces_vars,  fa="pc", n.iter=100,
#             show.legend=FALSE, main="Scree plot with parallel analysis")

#plot all PCs
AMR::ggplot_pca(prcomp(all_cces_vars[complete.cases(all_cces_vars),]))
```


### Correlation between racial and gender resentment

```{r}

# n is 875
cor(activity_data$fem_cts, activity_data$rr_cts, 
    use = "complete.obs",
    method = "pearson")

```


### Model with combined score

```{r}
# combine racial+gender resentment variable
activity_data <- activity_data %>%
  rowwise() %>%
  mutate(rr_fem_cts = mean(c(rr_cts,fem_cts),na.rm=T ) ) %>%
  ungroup()


f_time_rrfem_alternative <-
  formula(
    activity_yt_video_time_elapsed_capped_total_alternative_all ~  rr_fem_cts + jw_cts + age + gender + educ2 + race
  )
f_time_rrfem_extremist <-
  formula(
    activity_yt_video_time_elapsed_capped_total_extremist_all  ~  rr_fem_cts + jw_cts + age + gender + educ2 + race 
  )
f_time_rrfem_mainstream <-
  formula(
    activity_yt_video_time_elapsed_capped_total_mainstream_all  ~  rr_fem_cts + jw_cts + age + gender + educ2 + race 
  )

f_visit_rrfem_alternative <-
  formula(
    activity_yt_n_video_alternative_all ~  rr_fem_cts + jw_cts + age + gender + educ2 + race
  )
f_visit_rrfem_extremist <-
  formula(
    activity_yt_n_video_extremist_all ~  rr_fem_cts + jw_cts + age + gender + educ2 + race 
  )
f_visit_rrfem_mainstream <-
  formula(
    activity_yt_n_video_mainstream_all  ~  rr_fem_cts + jw_cts + age + gender + educ2 + race  
  )

time_rrfem_fs <- list(
  f_time_rrfem_alternative,
  f_time_rrfem_extremist,
  f_time_rrfem_mainstream
)
visit_rrfem_fs <- list(
  f_visit_rrfem_alternative,
  f_visit_rrfem_extremist,
  f_visit_rrfem_mainstream
)

# estimate with and without robust SEs
QP_time_rrfem_fit <- QP_visit_rrfem_fit <- list()
QP_time_rrfem_fit_norob <- QP_visit_rrfem_fit_norob <- list()
for (i in 1:length(time_rrfem_fs)) {
  QP_time_rrfem_fit[[i]] <-
    robust_weighted_quasipoisson(formula = time_rrfem_fs[[i]])
  QP_time_rrfem_fit_norob[[i]] <-
    robust_weighted_quasipoisson(formula = time_rrfem_fs[[i]],
                                 robust_output = F)
  QP_visit_rrfem_fit[[i]] <-
    robust_weighted_quasipoisson(formula = visit_rrfem_fs[[i]])
  QP_visit_rrfem_fit_norob[[i]] <-
    robust_weighted_quasipoisson(formula = visit_rrfem_fs[[i]],
                                 robust_output = F)
}
robust_SEs <- bind_rows(c(QP_visit_rrfem_fit, QP_time_rrfem_fit) )$robust_se
robust_pvals <- bind_rows(c(QP_visit_rrfem_fit, QP_time_rrfem_fit) )$p_val

texreg(c(QP_visit_rrfem_fit_norob, QP_time_rrfem_fit_norob),
       digits = 2,
       booktabs = TRUE, 
       label = "tab:figaxtable",
       caption = "Quasipoisson coefficients for correlates of of views and time per week spent on videos from alternative, extremist, and mainstream media channels. Robust standard errors are in parentheses.",
       # texreg reads models as separate lists
       override.pvalues =  list(robust_pvals[1:9],
                    robust_pvals[10:18],
                    robust_pvals[19:27],
                    robust_pvals[28:36],
                    robust_pvals[37:45],
                    robust_pvals[46:54]),
       override.se =  list(robust_SEs[1:9],
                    robust_SEs[10:18],
                    robust_SEs[19:27],
                    robust_SEs[28:36],
                    robust_SEs[37:45],
                    robust_SEs[46:54]),
       custom.model.names = rep(c("Alternative",
                                 "Extremist",
                                 "Mainstream"),2),
       custom.coef.map =
         list(
            "rr_fem_cts" = "Combined racial resentment and hosile sexism",
            "jw_cts" = "Feeling Jews",
            "age" = "Age",
            "genderMale" =  "Male",
            "raceNon-white" = "Non-white",
            "educ2Some college" = "Some college",
            "educ24-year" = "Bachelor's degree",
            "educ2Post-grad" = "Post-grad",
            "pid_leanDemocrat" = "Democrat",
            "pid_leanRepublican" = "Republican",
            "(Intercept)" = "Intercept"
         ),
          file = "appendix-figures_files/additional_regressions_combined_hs_rr.tex")


```



## Figure S8 and Table S3: Zero-inflated models on correlates of time on YouTube video by channel type

```{r}
# make floats integers for pscl
activity_data <- activity_data %>%
  mutate(
    minutes_activity_yt_video_time_elapsed_capped_total_alternative_all = round(
      minutes_activity_yt_video_time_elapsed_capped_total_alternative_all,
      0
    ),
    minutes_activity_yt_video_time_elapsed_capped_total_extremist_all = round(
      minutes_activity_yt_video_time_elapsed_capped_total_extremist_all,
      0
    ),
    minutes_activity_yt_video_time_elapsed_capped_total_mainstream_all = round(
      minutes_activity_yt_video_time_elapsed_capped_total_mainstream_all,
      0
    )
  )

# formulas
zif_time_alternative_all <-
  formula(
    minutes_activity_yt_video_time_elapsed_capped_total_alternative_all ~  rr_cts + jw_cts + fem_cts + age + gender + educ2 + race |
      rr_cts + jw_cts + fem_cts + age + gender + educ2 + race
  )
zif_time_extremist_all <-
  formula(
    minutes_activity_yt_video_time_elapsed_capped_total_extremist_all  ~  rr_cts + jw_cts + fem_cts + age + gender + educ2 + race |
      rr_cts + jw_cts + fem_cts + age + gender + educ2 + race
  )
zif_time_mainstream_all <-
  formula(
    minutes_activity_yt_video_time_elapsed_capped_total_mainstream_all  ~  rr_cts + jw_cts + fem_cts + age + gender + educ2 + race |
      rr_cts + jw_cts + fem_cts + age + gender + educ2 + race
  )

#toutes
zi_time_fs <- list(
  zif_time_alternative_all,
  zif_time_extremist_all,
  zif_time_mainstream_all
)

ZIP_time_fit <- list()
for (i in 1:length(zi_time_fs)) {
  ZIP_time_fit[[i]] <- pscl::zeroinfl(
    zi_time_fs[[i]],
    data = activity_data,
    weights = weight_cmd,
    offset = log(week),
    dist = "poisson"
  )
  #print(summary(ZIP_time_fit[[i]]))
}

names(ZIP_time_fit) <-
  c("time_alternative_full",
    "time_extremist_full",
    "time_mainstream_full")
```


```{r zip plot, fig.height = 12}
time_plot_labels <- c(
  "alternative" = str_wrap("Minutes/week on alternative channel videos", width = 30),
  "extremist" = str_wrap("Minutes/week on extremist channel videos", width = 30),
  "mainstream" = str_wrap("Minutes/week on mainstream media channel videos", width = 30)
)
robust_estimates_fxn <- function(model) {
  result <- lmtest::coeftest(model, vcov = sandwich(model))
  CIs <- confint(result) # honestly probably better to bootstrap, but w/e just be consistent
  ci_lwr <- CIs[,1]
  ci_upr <- CIs[,2]
  stderrs <- result[,2]
  pvals <- result[,4]
  
  out <- tibble(ci_lwr, 
                ci_upr,
                robust_se = stderrs,
                pval = pvals,
                estimate = result[,1],
                covar = row.names(result)
                ) %>% 
    separate(covar, 
             into = c("component", "predictor"), 
             sep = "_", 
             extra = "merge")
  return( out)
}

zip_coef_plot <- function(coef_df) {
  
  if(unique(coef_df$component) %in% "zero") {
    x_limit <-  c(-3.5, 3)
     x_lab <- "Logit coefficient"
  } else {
    x_limit <- c(-5, 5)
     x_lab <- "Poisson coefficient"
  }
  if( str_detect(deparse(substitute(coef_df)), "visit") ){
    facet_lab <- visit_plot_labels
  } else facet_lab <- time_plot_labels
  
  coef_df %>% 
  ggplot(aes(x = estimate, y = str_wrap_factor(predictor, width = 12) )) +
  geom_vline(xintercept = 0, lty = 2) +
  geom_linerange(aes(xmin = ci_lwr, xmax = ci_upr,
                     color = channel_type),
                 lwd = 8, 
             show.legend = FALSE) +
  geom_point(size = 10, shape = 21, stroke = 1,
             color = "black", fill = "#FFFFFF") +
  geom_text(aes(label = format(round(estimate, 2),nsmall = 2)) ,
            size = 3.5, color = "black") +
  geom_text(aes(label = stat_sig ),
            size = 8, nudge_x = .5, nudge_y = .25) +
  facet_wrap( ~ channel_type,
              labeller = as_labeller(
               facet_lab
              )) +
  scale_color_manual(values = c("alternative" = "#FFA500", 
                                "extremist" = "#CD5C5C", 
                                "mainstream" = "#015CB9")) +
  labs(y = "", 
       x = x_lab
       ) +
  scale_x_continuous(limits = x_limit) +
  theme_bw() + 
  theme(strip.text = element_text(size = 12, hjust = .5),
        strip.background = element_rect(fill = "grey", 
                                        color = "grey"),
        axis.text.y = element_text(size = 12))
}

ZIP_time_df <- map_dfr(1:3, ~robust_estimates_fxn(ZIP_time_fit[[.x]])) %>% 
  mutate(channel_type = c(
    rep("alternative", 20),
    rep("extremist", 20),
    rep("mainstream", 20)
  ),
  stat_sig = case_when(estimate > 0 & ci_lwr > 0 ~ "*",
                              estimate < 0 & ci_upr < 0 ~ "*",
                              TRUE ~ "")) 

ZIP_time_count_df <- ZIP_time_df %>% 
  filter(component == "count")  %>% 
  filter(predictor != "(Intercept)") %>% 
  mutate(predictor = refactor_fxn(recode_fxn(predictor)))
ZIP_time_zero_df <- ZIP_time_df %>% 
  filter(component == "zero") %>% 
  filter(predictor != "(Intercept)") %>% 
  mutate(predictor = refactor_fxn(recode_fxn(predictor)))



ZIP_time_count_plot <- zip_coef_plot(ZIP_time_count_df)
ZIP_time_zero_plot <- zip_coef_plot(ZIP_time_zero_df)

plot_grid(
  ggplot(),
  ZIP_time_zero_plot,
  ggplot(),
  ZIP_time_count_plot,
  nrow = 4,
  rel_heights = c(.1, 1, .1, 1),
  vjust = 3, 
  labels = c("A: Zero component", "", "B: Count component", "")
)

ggsave("appendix-figures_files/zip_coefficient_time.png",
       height = 14, width = 11 )

```


```{r ZIP table}
texreg(ZIP_time_fit,
       digits = 2,
       booktabs = TRUE, 
       label = "tab:figa5table",
       caption = "Zero-inflated Poisson coefficients for correlates of the time per week spent on videos from alternative, extremist, and mainstream media channels. Robust standard errors are in parentheses.",
       # texreg reads models as separate lists
       override.pvalues = list(ZIP_time_df$pval[1:20],
                               ZIP_time_df$pval[21:40],
                               ZIP_time_df$pval[41:60]),
       override.se = list(ZIP_time_df$robust_se[1:20],
                               ZIP_time_df$robust_se[21:40],
                               ZIP_time_df$robust_se[41:60]),
       custom.model.names = c("Alternative",
                              "Extremist",
                              "Mainstream"),
       custom.coef.map =
         list(
           "Count model: (Intercept)" = "Intercept",
           "Count model: rr_cts" = "Racial resentment",
           "Count model: jw_cts" = "Feeling Jews",
           "Count model: fem_cts" = "Hostile sexism",
           "Count model: age" = "Age",
           "Count model: genderMale" = "Male",
           "Count model: educ2Some college" = "Some college",
           "Count model: educ24-year" = "Bachelor's degree",
           "Count model: educ2Post-grad" = "Post-grad",
           "Count model: raceNon-white" = "Non-white",
           "Zero model: (Intercept)" = "Intercept",
           "Zero model: rr_cts" = "Racial resentment",
           "Zero model: jw_cts" = "Feeling Jews",
           "Zero model: fem_cts" = "Hostile sexism",
           "Zero model: age" = "Age",
           "Zero model: genderMale" = "Male",
           "Zero model: educ2Some college" = "Some college",
           "Zero model: educ24-year" = "Bachelor's degree",
           "Zero model: educ2Post-grad" = "Post-grad",
           "Zero model: raceNon-white" = "Non-white"
         ),
       reorder.coef = c(seq(1,20, by=2),
                        seq(2, 20, by=2)),
       file = "appendix-figures_files/figureA5.tex")

```



## Figure S9 and Table S4: Correlates of YouTube video views by channel type

```{r}
# DV is view counts
f_visit_alternative_all <- formula(activity_yt_n_video_alternative_all ~  rr_cts + jw_cts + fem_cts + age + gender + educ2 + race )
f_visit_extremist_all <- formula(activity_yt_n_video_extremist_all ~  rr_cts + jw_cts + fem_cts + age + gender + educ2 + race)
f_visit_mainstream_all <- formula(activity_yt_n_video_mainstream_all  ~  rr_cts + jw_cts + fem_cts + age + gender + educ2 + race )  

main_visit_fs <- list(
  f_visit_alternative_all,
  f_visit_extremist_all,
  f_visit_mainstream_all
)

QP_visit_fit <- list()
for (i in 1:length(main_visit_fs)) {
  QP_visit_fit[[i]] <-
    robust_weighted_quasipoisson(
      formula = main_visit_fs[[i]]
    )
  #print(QP_visit_fit[[i]])
}
names(QP_visit_fit) <- 
  c(
    "visit_alternative_full",
    "visit_extremist_full",
    "visit_mainstream_full"
  )

coef_names <- c("Intercept",
                "Racial resentment",
                "Feeling Jews",
                "Hostile sexism",
                "Age",
                "Male",
                "Some college",
                "Bachelor's degree",
                "Post-grad",
                "Non-white")

visit_models <- bind_rows(QP_visit_fit)
```


```{r visits plot}
visit_plot_labels <- c(
  "alternative" = str_wrap("Views/week to alternative channel videos", width = 30),
  "extremist" = str_wrap("Views/week to extremist channel videos", width = 30),
  "mainstream" = str_wrap("Views/week to mainstream media channel videos", width = 30)
)

visit_models %>% 
  filter(predictor != "(Intercept)") %>% 
  mutate(predictor = refactor_fxn(recode_fxn(predictor))) %>% 
  ggplot(aes(x = estimate, y = str_wrap_factor(predictor, width = 12) )) +
  geom_vline(xintercept = 0, lty = 2) +
  geom_linerange(aes(xmin = ci_lwr, xmax = ci_upr,
                     color = channel_type),
                 size = 9,
             show.legend = FALSE) +
  geom_point(size = 10, shape = 21, stroke = 1,
             color = "black", fill = "#FFFFFF") +
  geom_text(aes(label = round(estimate, 2)),
            size = 3.5, color = "black") +
  geom_text(aes(label = stat_sig ),
            size = 8, nudge_x = .45, nudge_y = .25) +
  facet_wrap( ~ channel_type,
              labeller = as_labeller(
               visit_plot_labels
              )) +
  scale_color_manual(values = c("alternative" = "#FFA500", 
                                "extremist" = "#CD5C5C", 
                                "mainstream" = "#015CB9")) +
  labs(y = "", x = "Quasipoisson coefficient") +
  theme_bw() + 
  theme(strip.text = element_text(size = 12, hjust = .5),
        strip.background = element_rect(fill="grey", color = "grey"),
        axis.text.y = element_text(size = 12))

ggsave("appendix-figures_files/qpois_coefficient_visit.png",
       height = 6, width = 11 )
```


```{r fig a6 table}
# separately estimate without robust SEs becuase tex reg doesn't like this... :L
QP_visit_fit_nonrob <- list()
for (i in 1:length(main_visit_fs)) {
  QP_visit_fit_nonrob[[i]] <-
    robust_weighted_quasipoisson(formula = main_visit_fs[[i]],
                                 robust_output = F)
}

robust_SEs <- visit_models$robust_se
robust_pvals<- visit_models$p_val
texreg(QP_visit_fit_nonrob,
       digits = 2,
       booktabs = TRUE, 
       label = "tab:figa6table",
       caption = "Quasipoisson coefficients for correlates of time per week spent on videos from alternative, extremist, and mainstream media channels. Robust standard errors are in parentheses.",
       # texreg reads models as separate lists
       override.pvalues = list(robust_pvals[1:10],
                    robust_pvals[11:20],
                    robust_pvals[21:30]),
       override.se = list(robust_SEs[1:10],
                    robust_SEs[11:20],
                    robust_SEs[21:30]),
       custom.model.names = c("Alternative",
                                 "Extremist",
                                 "Mainstream"),
       custom.coef.map =
         list(
            "fem_cts" = "Hostile sexism",
            "rr_cts" = "Racial resentment",
            "jw_cts" = "Feeling Jews",
            "age" = "Age",
            "genderMale" =  "Male",
            "raceNon-white" = "Non-white",
            "educ2Some college" = "Some college",
            "educ24-year" = "Bachelor's degree",
            "educ2Post-grad" = "Post-grad",
            "(Intercept)" = "Intercept"
         ),
          file = "appendix-figures_files/figureA6.tex")

```



## Figure S12: Correlation between browser history and activity

```{r}
### --------- specify a bunch of variables we want to look at
channel_types <- c("alternative",
                   "extremist",
                   "mainstream",
                   "other")

# proportion that watched any alt/ext/msm videos
bh_dummy <- c("bh_alternative_any",
              "bh_extremist_any",
              "bh_mainstream_any",
              "bh_other_any")
at_dummy <- c("at_alternative_any",
              "at_extremist_any",
              "at_mainstream_any",
              "at_other_any")
bh_counts <- paste0("browser_history_yt_n_video_",
                    channel_types,
                    "_all")
bh_time <-
  paste0("browser_history_yt_video_time_elapsed_capped_total_",
         channel_types,
         "_all")
at_counts <- paste0("activity_yt_n_video_",
                    channel_types,
                    "_all")
at_time <- paste0("activity_yt_video_time_elapsed_capped_total_",
                  channel_types,
                  "_all")

# number subscribed to alternative
bh_subscribed_counts <- paste0("browser_history_yt_n_video_",
                               channel_types,
                               "_subscribed")
at_subscribed_counts  <-  paste0("activity_yt_n_video_",
                                 channel_types,
                                 "_subscribed")
# time on subscribed to alternative
bh_subscribed_time <- paste0("browser_history_yt_video_time_elapsed_capped_total_",
                               channel_types,
                               "_subscribed")
at_subscribed_time  <-  paste0("activity_yt_video_time_elapsed_capped_total_",
                                 channel_types,
                                 "_subscribed")


# put activity and browser history together
select_activity_data_data <- activity_data %>%
  select(caseid, at_alt, at_ext, at_msm, at_other, activity_n_total,
         all_of(at_counts),
         all_of(at_time),
         all_of(at_subscribed_counts),
         all_of(at_subscribed_time))
select_browser_history_data <- browser_history_data %>%
  select(caseid, bh_alt, bh_ext, bh_msm, bh_other, browser_history_n_total,
         all_of(bh_counts),
         all_of(bh_time),
         all_of(bh_subscribed_counts),
         all_of(bh_subscribed_time))
bh_at_data <-
  full_join(select_activity_data_data, select_browser_history_data, 
            by = "caseid") %>%
  rename( bh_alternative_any = bh_alt, 
         bh_extremist_any = bh_ext,
         bh_mainstream_any = bh_msm,
         bh_other_any = bh_other,
         at_alternative_any = at_alt,
         at_extremist_any = at_ext,
         at_mainstream_any = at_msm,
         at_other_any = at_other)

correlation_fxn <- function(bh_var, at_var, data = bh_at_data) {
  r = cor(x = data[[bh_var]], 
          y = data[[at_var]], 
          use = "complete.obs", method = "pearson")
  out <- tibble(corr = r,
                channel_type = str_extract(at_var, 
                                           paste0(channel_types, collapse = "|") ),
                metric = case_when(
                  # youtube general
                  str_detect(at_var, "_yt_n_total") ~ "total yt",
                  # youtube channel type
                  str_detect(at_var, "any") ~ "binary",
                  str_detect(at_var, "subscribed") &
                    str_detect(at_var, "yt_n_video") ~ "sub count",
                  str_detect(at_var, "subscribed") &
                    str_detect(at_var, "time") ~ "sub time",
                  str_detect(at_var, "yt_n_video") ~ "count",
                  str_detect(at_var, "time") ~ "time"
                ))
  return(out)
}

correlation_plot <- function(bh_var, at_var, data = merged_data) {
  r = cor(x = data[[bh_var]],
          y = data[[at_var]],
          use = "complete.obs", method = "pearson")
  out <- ggplot(aes(x = data[[bh_var]],
                    y = data[[at_var]]),
                data = data) +
    geom_point() +
    geom_abline()
  return(out)
}
```


```{r, fig.height = 8}
# any visits to channel type
corr_binary <- map2(.x = bh_dummy, .y = at_dummy,
     correlation_fxn) %>% 
  bind_rows()
# number of videos
corr_counts <- map2(.x = bh_counts, .y = at_counts,
     correlation_fxn) %>% 
  bind_rows()
# capped time on videos
corr_time <- map2(.x = bh_time, .y = at_time,
     correlation_fxn) %>% 
  bind_rows()

# number subscribed
corr_subs_count <- map2(.x = bh_subscribed_counts, .y = at_subscribed_counts,
     correlation_fxn) %>% 
  bind_rows()
# time subscribed
corr_subs_time <- map2(.x = bh_subscribed_time, .y = at_subscribed_time,
     correlation_fxn) %>% 
  bind_rows()

# youtube correlations
youtube_corr_df <- corr_binary %>% 
  bind_rows(corr_counts) %>% 
  bind_rows(corr_time) %>% 
  bind_rows(corr_subs_count) %>% 
  bind_rows(corr_subs_time) %>% 
  mutate(channel_type = recode_channel_type_fxn(channel_type),
         metric = factor(metric, levels = c(
           "binary",
           "count", "sub count",
           "time", "sub time"
         )))


youtube_channel_type <- youtube_corr_df %>% 
  ggplot(aes(x = channel_type, 
             y = 1,
             fill = corr)) +
  geom_tile(color = "black") + 
  geom_text(aes(label = format(round(corr, 2), nsmall = 2)),
            color = "white", size = 6) +
  scale_fill_gradient2(low = "#FFFFFF", 
                       mid = "#6F9EBE", 
                       high = "#032146",
                       midpoint = .5,
                      limits = c(0, 1),
                      breaks = seq(0, 1, .2)) +
  coord_fixed(ratio = 1/2) +
  facet_wrap(~metric, nrow = 5,
             labeller = labeller(
               metric = c("binary" = "Any views of YouTube videos from channel",
                          "count" = "Total number of views of YouTube videos from channel",
                          "time" = "Seconds elapsed on YouTube videos from channel (capped)",
                          "sub count" = "Total number of views of subscribed channel",
                          "sub time" = "Seconds elapsed on videos from subscribed channel (capped)")
             )) +
  labs(x = "", y = "") +
  theme(axis.text.y = element_blank(),
        panel.grid = element_blank(),
        legend.position = 'bottom',
        strip.text.x = element_text(size = 12, hjust = 0))+
  guides(fill = guide_colourbar(barwidth = 20,
                                barheight = 1,
                                title = ""))
youtube_channel_type 

ggsave("appendix-figures_files/correlations_history_activity_youtube_channel_type.png",
      dpi = 600, height = 8, width = 9 )
```



## Table S9: Differential browsing behavior after install

(CODE ONLY)

```{r, eval=FALSE, ITS}
### combine ALT/EXT/TOTAL browser history user-time data 
# must use browser HISTORY data
alt_ext_time <- bh_alt_time %>% 
  pivot_longer(-user_id, names_to = "date") %>% 
  mutate(source = "alternative") %>% 
  bind_rows(bh_ext_time %>% 
              pivot_longer(-user_id, names_to = "date") %>% 
              mutate(source = "extremist") ) %>% 
  right_join(bh_all_time %>%
              pivot_longer(-user_id, names_to = "date", values_to = "total"),
            by = c("user_id", "date")) %>%
  mutate(full_date = as.Date(date),
         year = 2020,
         month = month(full_date),
         week = week(full_date),
         week_date = as.Date(str_c(week, "-1"), format = "%U-%u"),
         fraction = value/total) %>% 
  group_by(week_date, source) %>% 
  mutate(time_per_week = sum(value)) %>% 
  group_by(month, source) %>% 
  mutate(time_per_month = sum(value)) %>% 
  ungroup()

# merge with yougov data
all_alt_ext_time <- alt_ext_time %>% 
  left_join(merged_data, by = "user_id") %>%
  mutate(install_date = ymd(install_date),
         install_diff = full_date - install_date,  # install date difference
         install_diff_biweek = cut(as.numeric(install_diff), 
                                   breaks = seq(-120, 150, by = 15))) 

# make panel data
its_panel_data <- all_alt_ext_time %>%
  mutate(value = if_else(!is.na(total), replace_na(value, 0), value)) %>%
  group_by(full_date, user_id, .drop = FALSE) %>%
  summarize(fraction_time_per_day = value/total,
            weight_cmd, value, total, install_diff) %>%
  ungroup() %>% 
  mutate(
    installed = if_else(install_diff < 0, 0, 1),
    days_after_install = ifelse(install_diff < 0, 0, install_diff)
  ) %>% 
  distinct(full_date, user_id, .keep_all = T) 

# estimate user/day fixed effects OLS
weighted_FE1 <-
  lm(
    fraction_time_per_day ~ installed + factor(user_id) + factor(full_date),
    data = its_panel_data,
    weights = weight_cmd
  )
# same model but with days after install
weighted_FE2 <-
  lm(
    fraction_time_per_day ~ installed + days_after_install + factor(user_id) + factor(full_date),
    data = its_panel_data,
    weights = weight_cmd
  )

# cluster by user
lmtest::coeftest(weighted_FE1, vcov = sandwich::vcovCL(weighted_FE1,
                                                cluster = ~ user_id,
                                                type = "HC1"))
lmtest::coeftest(weighted_FE2, vcov = sandwich::vcovCL(weighted_FE2,
                                                cluster = ~ user_id,
                                                type = "HC1"))

```



## Figure S13: Differences in perceptions of YouTube between full sample and extension sample

```{r}
### ====== survey measures for those who did and did not install the extension
yg_youtube <-  merged_data %>% 
  select(user_id, weight_cmd, browser_sample,
         starts_with("youtube"),
         -youtube_vid_quality,
         -youtube_freq_use_3,
         -youtube_freq_use)

youtube_vars <- yg_youtube %>% 
  select(starts_with("youtube")) %>% 
  names()

yg_youtube_factor <- yg_youtube %>% 
  mutate(youtube_vid_trust = case_when(youtube_vid_trust == 1 ~ "A great deal",
                                       youtube_vid_trust == 2 ~ "A moderate amount",
                                       youtube_vid_trust == 3 ~ "Not much",
                                       youtube_vid_trust == 4 ~ "Not at all"),
         youtube_info_accurate = case_when(youtube_info_accurate == 1 ~ "All or almost all",
                                           youtube_info_accurate == 2 ~ "Most",
                                           youtube_info_accurate == 3 ~ "Some",
                                           youtube_info_accurate == 4 ~ "Very little",
                                           youtube_info_accurate == 5 ~ "None at all"),
         youtube_vid_bias = case_when(youtube_vid_bias == 1 ~ "Favor liberals",
                                      youtube_vid_bias == 2 ~ "Favor conservatives",
                                      youtube_vid_bias == 3 ~ "Neither",
                                      youtube_vid_bias == 4 ~ "Other"),
         youtube_personalize = case_when(youtube_personalize == 1 ~ "Very accurate",
                                         youtube_personalize == 2 ~ "Somewhat accurate",
                                         youtube_personalize == 3 ~ "Not very accurate",
                                         youtube_personalize == 4 ~ "Not at all accurate"),
         youtube_vid_trust = factor(youtube_vid_trust, levels = c("A great deal", "A moderate amount", "Not much", "Not at all")),
         youtube_info_accurate = factor(youtube_info_accurate, levels = c("All or almost all", "Most", "Some", "Very little", "None at all")),
         youtube_vid_bias = factor(youtube_vid_bias, levels = c("Favor liberals", "Favor conservatives", "Neither", "Other")),
         youtube_personalize = factor(youtube_personalize, levels = c("Very accurate", "Somewhat accurate", "Not very accurate", "Not at all accurate"))) %>% 
  dummy_cols(select_columns = youtube_vars)




svy_object <- svydesign(ids = ~ 1,
                        weights = ~ weight_cmd,
                        data = yg_youtube_factor)

estimates_df <- map_dfr(youtube_vars,
                        function (x) {
                          svyresults <- svyby(
                            ~ get(x),
                            by = ~ browser_sample,
                            design = svy_object,
                            svymean,
                            na.rm = TRUE
                          )
                          remove_str <- c(paste0("browser:", "get(x)"),
                                          paste0("full:", "get(x)"))
                          # technically, devrais utiliser svyciprop, mais la difference est négligeable ici
                          CIs <- confint(svyresults) 
                          ci_df <- CIs %>% 
                            as_tibble() %>% 
                            mutate(name = rownames(CIs)) %>% 
                            mutate(browser_sample = if_else(str_detect(name, "browser\\:"),
                                                            "browser", "full"),
                                   varlevel = stringr::str_replace_all(name,
                                                                   fixed(remove_str[1]),
                                                                   ""),
                                   varlevel = stringr::str_replace_all(varlevel,
                                                                   fixed(remove_str[2]),
                                                                   "")) %>% 
                            mutate(join_name = paste0(browser_sample,varlevel))%>% 
                            rename(ci_lwr = `2.5 %`,
                                   ci_upr = `97.5 %`) %>% 
                            select(join_name, ci_lwr, ci_upr) 
                          
                          df <- as_tibble(svyresults) %>% 
                            pivot_longer(-browser_sample) %>%
                            mutate(stat = if_else(str_detect(name, "se\\."), "std_err", "prop"),
                                   name = stringr::str_replace_all(name, 
                                                                   fixed("get(x)"), 
                                                      ""),
                                   name = stringr::str_replace_all(name, 
                                                                   fixed("se."), 
                                                                   "")) %>%  
                            pivot_wider(names_from = stat) %>%
                            rename(varlevel = name) %>% 
                            mutate(join_name = paste0(browser_sample,varlevel)) %>% 
                            left_join(ci_df, 
                                      by = "join_name")  %>% 
                            mutate(variable = x)
                          return(df)
                        }) %>% 
  #rename(var_level = name) %>%
  mutate(
    percentage = 100 * prop,
    ci_lwr = 100 * ci_lwr,
    ci_upr = 100 * ci_upr
  )


youtube_attitudes_plot <- function(youtube_var) {
  facet_labs <- c(
    "youtube_info_accurate" = "Accuracy of information on YouTube",
    "youtube_personalize" = "Belief YouTube personalizes",
    "youtube_vid_trust" = "Trust in YouTube videos",
    "youtube_vid_bias" = "Perception of YouTube bias"
  )
  
  filtered_estimates_df <- estimates_df %>% 
    filter(variable == youtube_var) %>% 
    mutate(varlevel = str_wrap_factor(factor(varlevel,
                            levels = levels(yg_youtube_factor[[youtube_var]])),
           width = 6),
           browser_sample = factor(browser_sample, levels = c("full", "browser")))
  
  filtered_estimates_df %>% 
    ggplot(aes(x = varlevel, y = percentage,
               ymin = ci_lwr, ymax = ci_upr,
               color = browser_sample)) +
    geom_point(size = 2,
               position = position_dodge(width = .3)) +
    geom_errorbar(position = position_dodge(width = .3),
                  width = 0) + 
    labs(x = "", y = "") +
    scale_y_continuous(limits = c(0, 80), 
                       labels = paste0(seq(0, 80, 20), "%"),
                       breaks = seq(0, 80, 20)) +
    facet_wrap( ~ variable,
                scales = "free_x",
                labeller = labeller(
                  variable = facet_labs
                )) +
    scale_color_manual(labels = c("browser" = "Extension sample",
                                  "full" = "Full sample"),
                       values = c("#F74846",
                                  "#28B0C4"),
                       name = "") +
    ggthemes::theme_gdocs() +
    theme(legend.position = 'bottom',
          legend.direction = 'horizontal',
          strip.text = element_text(size = 14, hjust = 1),
          strip.background =element_rect(fill="grey", color = "grey"),
          panel.grid.major.y = element_line(color = "grey95"),
          panel.grid.major.x = element_blank(),
          axis.text.x = element_text(size = 12),
          plot.background = element_rect(color = "white"))
}

youtube_attitudes_plots <- map(
  1:4, ~youtube_attitudes_plot(youtube_vars[[.x]])  +
    theme(legend.position = 'none')
)
common_legend <- get_legend(youtube_attitudes_plot(youtube_vars[[1]]) )

plot_grid(
  plot_grid(
    youtube_attitudes_plots[[1]],
    youtube_attitudes_plots[[2]],
    youtube_attitudes_plots[[3]],
    youtube_attitudes_plots[[4]],
    nrow = 2
  ),
  common_legend,
  ncol = 1,
  rel_heights = c(1, .1)
)

ggsave("appendix-figures_files/youtube_attitudes.png",
      dpi = 600, height = 6, width = 12 )
```


## Figure S14: Percentage of views to each channel type by video number within session

This repository does not contain session-level data in `merged.tsv`, but code that reproduces session level figures are presented.

```{r s14, eval=FALSE}
# need data where each observation is a youtube action
activity_youtube = read_delim(paste0(data_dir, "youtube/merged.tsv"),
                              delim = "\t")
# only sessions with videos in them
activity_youtube_video <- activity_youtube %>% 
  filter(!is.na(video_id))

# remove users not matched to surveys
restricted_sample <- activity_youtube_video %>% 
  filter(user_id %in% activity_data$user_id) %>% 
  left_join(activity_data %>%
              select(user_id, browser_sample, samplegroup, weight_cmd))

within_session_wtd <- restricted_sample %>% 
  group_by(user_id, yt_session) %>% 
  mutate(yt_n_video_in_session = n(),
         video_index = row_number(yt_session)) %>% 
  ungroup() %>% 
  group_by(video_index) %>%
  mutate(across(c("alternative_match", 
                  "extremist_match", 
                  "mainstream_match", 
                  "other_match"),
                ~.x * weight_cmd)) %>%
  mutate(
    total_video_index = n(),
    fraction_alternative = sum(alternative_match) / total_video_index,
    fraction_extremist = sum(extremist_match) / total_video_index,
    fraction_mainstream = sum(mainstream_match) / total_video_index,
    fraction_other = sum(other_match) / total_video_index
  ) %>% 
  select(video_index, total_video_index, starts_with("fraction_")) %>% 
  pivot_longer(-c("video_index", "total_video_index"),
               names_to = "channel_type", 
               values_to = "proportion") %>% 
  distinct()

video_index_rug <- restricted_sample %>% 
  group_by(user_id, yt_session) %>% 
  mutate(yt_n_video_in_session = n(),
         video_index = row_number(yt_session)) %>% 
  select(video_index)


within_session_wtd %>%
  filter(channel_type != "fraction_other") %>% 
  filter(video_index %in% 1:319) %>% # 99%
  ggplot(aes(x = video_index, 
             y = proportion,
             weight = total_video_index,
             #size = weight_video_index,
             color = channel_type) ) +
  geom_point(alpha = .2) +
  geom_rug(data = video_index_rug %>% filter(video_index %in% 1:319),
           aes(x = video_index, y = 0),
           alpha = .005,
           sides = "b",
           position = position_jitter(width = .15),
           inherit.aes = F) +
  geom_smooth(method = 'loess',
              formula = 'y~x',
              span = .5, # using about a third of data points as window
              # i.e. uses 30% closest points to a given observation for the fit
              aes(fill = channel_type),
              method.args = list(degree = 1),
              alpha = .3) +
  scale_y_continuous(limits = c(0, .1),
                     labels = paste0(seq(0, 10, by = 2), "%"),
                     breaks = seq(0, .1, by = .02)) +
  scale_color_manual(values = c("#FFA500", "#CD5C5C", "#015CB9"),
                     labels = c("Alternative\nchannels",
                                "Extremist\nchannels",
                                "Mainstream\nmedia"),
                     name = "") +
  scale_fill_manual(values = c("#FFA500", "#CD5C5C", "#015CB9"),
                    labels = c("Alternative\nchannels",
                               "Extremist\nchannels",
                               "Mainstream\nmedia"),
                    name = "") +
  labs(x = "Video index within session", 
       y = paste0("Estimated percentage of channel type") ) +
  theme_minimal() +
  theme(legend.position = 'bottom',
        axis.title.x = element_text(hjust = .95))

ggsave("./data/extension_sessions_proportions_channel_type.png",
      dpi = 600, height = 6, width = 12 )
ggsave("./data/extension_sessions_proportions_channel_type.pdf",
      dpi = 600, height = 6, width = 12 )
```


## Figure S15: Percentage of views to each channel type by total session length

```{r s15, eval=FALSE}
user_sessions_at <- read_delim("./Data/extremism/user_summary_sessions_activity.tsv",
                            delim = '\t') 
`# merge sessions data with survey data
user_sessions_at_yg <- user_sessions_at %>% 
  left_join(activity_data %>% select(user_id, caseid_d34, weight_cmd)) %>% 
  filter(!is.na(weight_cmd))

user_sessions_at_yg %>%
  select(all_of(count_vars), yt_n_video, yt_window, weight_cmd) %>%
  mutate(across(c(all_of(count_vars)),
                ~.x*weight_cmd)) %>%
  group_by(yt_n_video) %>%
  summarize_at(.vars = all_of(count_vars),
               .funs = ~ (sum(.) / sum(yt_n_video*weight_cmd)) * 100) %>%
  pivot_longer(-yt_n_video) %>%
  filter(name != 'yt_n_video_other_all') %>%
  ggplot(aes(x = yt_n_video, y = value,
             color = name)) +
  geom_point(alpha = .5) +
  geom_smooth(method = 'loess',
              aes(fill = name),
              method.args = list(degree = 1),
              span = .5)+
  geom_rug(aes(x = yt_n_video,
               y = 0),
           alpha = .3,
           sides = "b",
           #position = position_jitter(width = .15),
           inherit.aes = F) +
  scale_color_manual(values = c("#FFA500", "#CD5C5C", "#015CB9"),
                     labels = c("Alternative\nchannels",
                                "Extremist\nchannels",
                                "Mainstream\nmedia"),
                     name = "") +
  scale_fill_manual(values = c("#FFA500", "#CD5C5C", "#015CB9"),
                     labels = c("Alternative\nchannels",
                                "Extremist\nchannels",
                                "Mainstream\nmedia"),
                     name = "") +
  labs(x = "Number of videos in session (session length)",
       y = 'Average percentage of channel videos viewed',
       caption = "") +
  coord_cartesian(xlim = c(0, session_length_threshold99)) +
  scale_y_continuous(limits = c(-1, 60),
                     breaks = seq(0, 60, 10),
                     labels = paste0(seq(0, 60, 10), "%") )

ggsave("appendix-figures_files/percent_visits_sessions_length99.png",
      dpi = 600, height = 6, width = 12 )
```


## Table S10: External referrers to alternative and extremist channel videos

(`referrers_data` is on a secure server and cannot be made public. See `build.R` for how the dataframe was constructed and for a list of the referrers.)

```{r eval=FALSE}
# external referrers table
total_external_referrers_by_channel_type_wtd <- referrers_data %>% 
  left_join(activity_data %>%
              select(user_id, weight_cmd),
            by = c('user_id_all'= 'user_id')) %>% 
  mutate(is_external_referrer = if_else(!str_detect(predecessor_url, "youtube.com"), 1,0 )) %>% 
  group_by(channel_type, is_external_referrer) %>% 
  summarise(n = sum(weight_cmd, na.rm = T)) %>% 
  filter(is_external_referrer == 1 & channel_type %in% c("alternative", "extremist"))

# calculate proportions
external_referrers_by_domain <- referrers_data %>% 
  left_join(activity_data %>%
              select(user_id, weight_cmd),
            by = c('user_id_all'= 'user_id')) %>% 
  mutate(external_referrer_group = case_when(predecessor_domain %in% external_referrers$main_social ~ "Mainstream social",
                                             predecessor_domain %in% external_referrers$alternative_social ~ "Alternative social",
                                             predecessor_domain %in% external_referrers$search_engine ~ "Search engine social",
                                             predecessor_domain %in% external_referrers$webmail ~ "Webmail",
                                             TRUE ~ NA_character_)) %>%
  filter(!is.na(external_referrer_group) ) %>% 
  group_by(predecessor_domain, channel_type) %>% 
  summarise(n = sum(weight_cmd, na.rm = T)) %>% 
  filter(channel_type %in% c("alternative", "extremist")) %>% 
  mutate(n_external_alternative = total_external_referrers_by_channel_type_wtd[total_external_referrers_by_channel_type_wtd$channel_type == "alternative", ]$n,
         n_external_extremist = total_external_referrers_by_channel_type_wtd[total_external_referrers_by_channel_type_wtd$channel_type == "extremist", ]$n,
         percent = if_else(channel_type == "alternative" ,
                        100*(n/n_external_alternative),
                        100*(n/n_external_extremist))) %>% 
  pivot_wider(names_from = "channel_type",
              values_from = c("percent", "n") ) %>% 
  select(-starts_with("n_")) %>% 
  mutate(across(c("percent_extremist", "percent_alternative"), ~ replace_na(., 0)))

#clean up/format table
external_referrers_by_domain_table <- external_referrers_by_domain %>% 
  mutate(external_referrer_group = case_when(predecessor_domain %in% external_referrers$main_social ~ "Mainstream social",
                                             predecessor_domain %in% external_referrers$alternative_social ~ "Alternative social",
                                             predecessor_domain %in% external_referrers$search_engine ~ "Search engine social",
                                             predecessor_domain %in% external_referrers$webmail ~ "Webmail",
                                             TRUE ~ NA_character_)) %>% 
  mutate(percent_extremist = round(percent_extremist, 3),
         percent_alternative = round(percent_alternative, 3))%>% 
  select(`Referrer type` = external_referrer_group,
         `Preceding domain` = predecessor_domain,
         `Percent to extremist channel` = percent_extremist,
         `Percent to alternative channel` = percent_alternative) %>% 
  arrange(`Referrer type`, `Percent to extremist channel`)

# print out
external_referrers_by_domain_table %>%
  kbl(format = "html",
      digits = 5,
      booktabs = TRUE) 

# save as TEX file
external_referrers_by_domain_table %>%
  kbl(format = "latex",
      digits = 3,
      booktabs = TRUE) %>% 
  readr::write_lines(file = "./appendix-figures_files/external_referrers_by_channel_type_wtd.tex")
```



## Figure S16: Concentration of exposure to alternative and extremist channels (view counts)

```{r ecdf visits}
cumsum_fxn <- function (var, data) {
  channel_type <-
      str_replace(str_extract(var, "[a-z]+_all$"), "_all", "")
  data %>%
    arrange(desc(.data[[var]])) %>%
    mutate(
      users = weight_cmd / sum(.$weight_cmd, na.rm = T),
      views = (.data[[var]] / sum(.data[[var]], na.rm = T)),
      cum_user = cumsum(users),
      cum_views = cumsum(views),
      ln_cum_user = log10(cumsum(users)),
      ln_cum_views = log10(cumsum(views)),
      source = channel_type
    ) %>%
    select(cum_user, cum_views, ln_cum_user, source, caseid, weight_cmd)
}

lab_stat <-
  cumsum_fxn('activity_yt_n_video_alternative_all', activity_data) %>%
  filter(round(cum_views, 2) == .80) %>%
  pull(ln_cum_user)
at_counts <- paste0("activity_yt_n_video_",
                    channel_types,
                    "_all")
at_cum_count_user <- bind_rows(map(at_counts, 
                                   ~ cumsum_fxn(.x, activity_data)))

count_cumsum_plot_inset <- at_cum_count_user %>%
  ggplot(aes(x = ln_cum_user, y = cum_views, color = source)) +
  geom_line(size = 2) +
  geom_point(
    aes(x = lab_stat, y = .8),
    color = 'black',
    shape = 4,
    stroke = 1,
    size = 1.5
  ) +
  scale_y_continuous(labels = scales::percent) +
  #scale_x_continuous(labels = scales::percent) +
  scale_color_manual(
    name = "",
    labels = c(
      "Alternative\nchannels",
      "Extremist\nchannels",
      "Mainstream media\nchannels",
      "Other\nchannels"
    ),
    values = c("#FFA500", "#CD5C5C", "#015CB9", "#E3E6E6")
  ) +
  geom_segment(aes(
    x = -4,
    xend = lab_stat,
    y = .8,
    yend = .8
  ),
  lty = 2,
  color = 'black') +
  geom_segment(aes(
    x = lab_stat,
    xend = lab_stat,
    y = 0,
    yend = .8
  ),
  lty = 2,
  color = 'black') +
  geom_curve(
    aes(
      x = -2.5,
      xend = lab_stat,
      y = .9,
      yend = .8
    ),
    #arrow = arrow(type = "closed", length = unit(0.03, "npc")),
    curvature = 0.25,
    angle = 35,
    color = 'black'
  ) +
  scale_x_continuous(
    limits = c(-4, 0),
    breaks = seq(-4, 0,
                 by = 1),
    labels = paste0((round(10 ^ (
      seq(-4, 0,
          by = 1)
    ), 3)) * 100, "%")
  ) +
  annotate(
    "text",
    x = -3,
    y = .95,
    label = str_wrap(paste0(
      format(round(10^(lab_stat), 3) * 100, nsmall = 1),
      "% of users account for 80% of visits to alternative channel videos."
    ), width = 45),
    size = 2.5
  ) +
  labs(x = expression(log[10]~ "(Percentage of users)"), 
       y = "Percentage of total exposure (views)") +
  theme_minimal() +
  #title = "Total percent alternative/extremist videos watched by user percentile"
  theme(
    plot.background = element_blank(),
    legend.position = 'bottom',
    axis.title.x = element_text(size = 10),
    axis.title.y = element_text(size = 10)
  );count_cumsum_plot_inset


count_cumsum_plot_zoomout <- at_cum_count_user %>%
  ggplot(aes(x = cum_user, y = cum_views, color = source)) +
  geom_line(size = 2) +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::percent) +
  scale_color_manual(
    name = "",
    labels = c(
      "Alternative\nchannels",
      "Extremist\nchannels",
      "Mainstream media\nchannels",
      "Other\nchannels"
    ),
    values = c("#FFA500", "#CD5C5C", "#015CB9", "#E3E6E6")
  ) +
  annotate(geom = "rect", col = "black", fill = "white",
           lwd = 2,
           ymin = 0, ymax = .87, xmin = .28, xmax = 1) +
  labs(x = "Percentage of users", y = "Percentage of total exposure (views)") +
  theme_minimal() +
  #title = "Total percent alternative/extremist videos watched by user percentile"
  theme(
    plot.background = element_blank(),
    legend.position = 'bottom',
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12)
  )


count_cumsum_plot_zoomout +
  patchwork::inset_element(count_cumsum_plot_inset +
                             guides(color = "none"), 
                           left = 0.3, bottom = 0.05, right = .95, top = .85,
                           on_top = TRUE)

ggsave(
  "appendix-figures_files/cdf_users_visit_exposure.pdf",
  width = 8.5,
  height = 5.5
)
```


